# county-level socialeconomic information
county_data <- fread("data/covid_county.csv")
# county-level COVID case and death
covid_rate <- fread("data/covid_rates.csv")
# county-level lockdown dates
covid_intervention <- fread("data/covid_intervention.csv")view(dfSummary(covid_rate),method = "render")
view(dfSummary(county_data),method = "render")# unique(covid_rate$State)
covid.3state.day = covid_rate[State %in% c("New York","Washington","Florida"),.(
cum_cases = sum(cum_cases),
cum_deaths = sum(cum_deaths),
week = mean(week)
),by = .(date,State)]
covid.3state.day = covid.3state.day[order(list(State,date))]
covid.3state.day[,new_cases := c(NA,diff(cum_cases)),by = State]
covid.3state.day[,date := ymd(date)]
ggplot(covid.3state.day,aes(x=date,y=new_cases,group=State,color=State))+
geom_line()+
scale_x_date(date_breaks = "3 month",date_labels = "%Y-%m")+
labs(title = "New Cases Trends for NY, WA and FL")+
theme_bw()The biggest problem: daily variability is extremely high, making the trends zigzagging. Maybe there is an innate difference among days in a week because people have different work and life style in different days. We may wish to smooth out these excessive noise by aggregating in the weekly level.
# check
covid_rate[State == "New York",length(County),by = .(State,date)] # problem: some counties do not report cases in some days. New added counties cause problems## State date V1
## 1: New York 2020-03-12 17
## 2: New York 2020-03-13 18
## 3: New York 2020-03-14 21
## 4: New York 2020-03-15 25
## 5: New York 2020-03-16 28
## ---
## 354: New York 2020-03-07 9
## 355: New York 2020-03-08 11
## 356: New York 2020-03-09 11
## 357: New York 2020-03-10 11
## 358: New York 2020-03-11 12
We find that the number of counties reporting cases every day is incresaing. We may want to adjust the total population accordingly.
covid.weekend = covid_rate[,.(
cum_cases_weekend = cum_cases[length(cum_cases)],
cum_deaths_weekend = cum_deaths[length(cum_deaths)],
TotalPopEst2019.weekend = TotalPopEst2019[length(TotalPopEst2019)]
), by = .(week,State,County)]
covid.weekend = covid.weekend[order(list(State,County,week))]
covid.weekend[,":="(new_cases = c(NA,diff(cum_cases_weekend)),
new_deaths = c(NA,diff(cum_deaths_weekend))),
by = .(State,County)]
covid.week = covid.weekend[!is.na(new_cases)&!is.na(new_deaths),.(
new_cases = sum(new_cases),
new_deaths = sum(new_deaths),
TotalPopEst2019 = sum(TotalPopEst2019.weekend)
),by = .(State,week)]
covid.week[,weekly_case_per100k := new_cases/TotalPopEst2019*100000]
# data.ma = covid_rate[State == "Massachusetts"] # pay attention: week 33 of MA
ggplot(covid.week,aes(x=week,y=weekly_case_per100k,group=State,color=State))+
geom_line()+
labs(title = "Spaghetti Plots, Weekly New Cases")+
theme_bw()# summary(covid_intervention)
# some examples
covid_intervention[STATE == "New York"]## Empty data.table (0 rows and 16 cols): FIPS,STATE,AREA_NAME,stay at home,>50 gatherings,>500 gatherings...
covid_intervention_state = covid_intervention[substr(FIPS,nchar(FIPS)-2,nchar(FIPS)) == "000"]
# NY
ggplot(covid.3state.day[State == "New York"],aes(x=date,y=new_cases))+
geom_line()+
scale_x_date(date_breaks = "3 month",date_labels = "%Y-%m")+
geom_vline(xintercept = covid_intervention_state[STATE == "NY","stay at home"][[1,1]],color = "red",lty = 5)+
labs(title = "New Cases Trends for NY")+
theme_bw()# FL
ggplot(covid.3state.day[State == "Florida"],aes(x=date,y=new_cases))+
geom_line()+
scale_x_date(date_breaks = "3 month",date_labels = "%Y-%m")+
geom_vline(xintercept = covid_intervention_state[STATE == "FL","stay at home"][[1,1]],color = "red",lty = 5)+
labs(title = "New Cases Trends for FL")+
theme_bw()The graphs for these two states indicate that the lockdown policy may have some positive effects on slowing down the spread of the virus.
# pay attention to numbers smaller than zero
covid_rate = covid_rate[order(FIPS,date)]
covid_rate[,":="(new_cases = c(NA,diff(cum_cases)),
new_deaths = c(NA,diff(cum_deaths)),
year = year(date),
month = month(date),
year.month = round_date(date,"month")),by = FIPS]
covid.month = covid_rate[!is.na(new_cases)&!is.na(new_deaths),.(
new_cases = sum(new_cases),
new_deaths = sum(new_deaths),
TotalPopEst2019 = TotalPopEst2019[length(TotalPopEst2019)]
),by = .(State,County,year,month)]
covid.month = covid.month[,.(
new_cases = sum(new_cases),
new_deaths = sum(new_deaths),
TotalPopEst2019 = sum(TotalPopEst2019)
),by = .(State,year,month)]
covid.month[,monthly_death_per100k := new_deaths/TotalPopEst2019*100000]Here we only give one example: 2020-9. The plots for all months are shown in (ii).
covid.death.plot.list = list()
setnames(covid.month,"State","state")
max_col = quantile(covid.month$monthly_death_per100k,1,na.rm = T)
min_col = quantile(covid.month$monthly_death_per100k,0,na.rm = T)
for (i in 2:12) {
covid.death.plot.list[[i-1]] =
plot_usmap(regions = "state",data = covid.month[year == 2020 & month == i],
values = "monthly_death_per100k", exclude = c("Hawaii", "Alaska"),color = "black") +
scale_fill_distiller(
palette = "Reds",direction = 1,
name = "Number of New Covid Deaths per 100,000 People",
limits = c(min_col, max_col)) +
labs(title = paste0("New Covid Deaths, 2020-",i), subtitle = "Continental US States") +
theme(legend.position = "right")
}
ggplotly(covid.death.plot.list[[8]])# plot_usmap(regions = "state",data = covid.month[year == 2020 & month == i],
# values = "monthly_death_per100k", exclude = c("Hawaii", "Alaska"),color = "black") +
# scale_fill_gradient(
# low = "white", high = "red",
# name = "Number of New Covid Deaths per 100,000 People",
# label = scales::comma) +
# labs(title = paste0("New Covid Deaths, 2020-",i), subtitle = "Continental US States") +
# theme(legend.position = "right")# ggplotly
test.long = covid.month[year == 2020,.(month,state,monthly_death_per100k)]
# test = test.long %>%
# pivot_wider(names_from = month,values_from = monthly_death_per100k) %>%
# mutate(state = tolower(state))
#
# map.wide = test %>%
# left_join(map_data("state"),by = c("state" = "region"))
# map.wide = data.table(map.wide)
# map = map.wide[order(order)] %>%
# pivot_longer(cols = 2:13,names_to = "month",values_to = "monthly_death_per100k") %>%
# mutate(month = as.numeric(month))
# map = data.table(map)
# map = map[order(list(month,order))]
#
# plot.test = ggplot(map,aes(x=long,y=lat,group=group))+
# geom_polygon(aes(fill = monthly_death_per100k))+
# geom_path()+
# scale_fill_distiller(palette = "Reds", direction = 1,
# name = "Num",
# limits = c(min_col, max_col))+
# labs(title = paste0("New Monthly Covid Deaths per 100k"), subtitle = "Continental US States")+
# theme_bw()
# ggplotly(plot.test) # error?
# plotly
abbr = unique(us_map(regions =
"states") %>% select(abbr, full))
plotly.data = covid.month[year == 2020,.(month,state,monthly_death_per100k)] %>%
mutate(hover = paste(state, '<br>',
'new covid deaths', round(monthly_death_per100k, 3))) %>%
left_join(abbr,by = c("state" = "full"))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white'),
exclude = c("")
)
fig <- plot_geo(plotly.data, locationmode = 'USA-states')
fig <- fig %>% add_trace(
z = ~monthly_death_per100k, text = ~hover, locations = ~abbr,
color = ~monthly_death_per100k, colors = 'Reds',
zmin = min_col, zmax = max_col,frame = ~month
)
fig <- fig %>% colorbar(title = "Monthly new covid deaths of state")
fig <- fig %>% layout(
title = 'Monthly new covid deaths of state',
geo = g,
hoverlabel = list(bgcolor="white")
)
figsave.image("codes/hw3_covid.RData")load("codes/hw3_covid.RData")county.covid = covid_rate[date == as.Date("2021-02-01"),.(
FIPS,County,State,
total_death_per100k = cum_deaths/TotalPopEst2019*100000
)] %>%
right_join(county_data, by = "FIPS") %>%
mutate(log_total_death_per100k = log(total_death_per100k + 1))
county.covid.sub <- county.covid %>%
select(log_total_death_per100k,State.x,County.x, FIPS, Deep_Pov_All, PovertyAllAgesPct, PerCapitaInc, UnempRate2019, PctEmpFIRE, PctEmpConstruction, PctEmpTrans, PctEmpMining, PctEmpTrade, PctEmpInformation, PctEmpAgriculture, PctEmpManufacturing, PctEmpServices, PopDensity2010, OwnHomePct, Age65AndOlderPct2010, TotalPop25Plus, Under18Pct2010, Ed2HSDiplomaOnlyPct, Ed3SomeCollegePct, Ed4AssocDegreePct, Ed5CollegePlusPct, ForeignBornPct, Net_International_Migration_Rate_2010_2019, NetMigrationRate1019, NaturalChangeRate1019, TotalPopEst2019, WhiteNonHispanicPct2010, NativeAmericanNonHispanicPct2010, BlackNonHispanicPct2010, AsianNonHispanicPct2010, HispanicPct2010, Type_2015_Update, RuralUrbanContinuumCode2013, UrbanInfluenceCode2013, Perpov_1980_0711, HiCreativeClass2000, HiAmenity, Retirement_Destination_2015_Update)
setnames(county.covid.sub,c("State.x","County.x"),c("state","county"))Num of missings in every variable
# county.covid.sub[is.na(state)]
apply(is.na(county.covid.sub), 2, sum) # missing in state, county: all aggregate states, puerto rico, hawaii, alaska; bedford in Virginia has two obs, duplicated## log_total_death_per100k
## 171
## state
## 171
## county
## 171
## FIPS
## 0
## Deep_Pov_All
## 6
## PovertyAllAgesPct
## 86
## PerCapitaInc
## 6
## UnempRate2019
## 7
## PctEmpFIRE
## 7
## PctEmpConstruction
## 7
## PctEmpTrans
## 7
## PctEmpMining
## 7
## PctEmpTrade
## 7
## PctEmpInformation
## 7
## PctEmpAgriculture
## 7
## PctEmpManufacturing
## 7
## PctEmpServices
## 7
## PopDensity2010
## 6
## OwnHomePct
## 6
## Age65AndOlderPct2010
## 6
## TotalPop25Plus
## 6
## Under18Pct2010
## 6
## Ed2HSDiplomaOnlyPct
## 6
## Ed3SomeCollegePct
## 6
## Ed4AssocDegreePct
## 6
## Ed5CollegePlusPct
## 6
## ForeignBornPct
## 85
## Net_International_Migration_Rate_2010_2019
## 85
## NetMigrationRate1019
## 85
## NaturalChangeRate1019
## 85
## TotalPopEst2019
## 6
## WhiteNonHispanicPct2010
## 6
## NativeAmericanNonHispanicPct2010
## 6
## BlackNonHispanicPct2010
## 6
## AsianNonHispanicPct2010
## 6
## HispanicPct2010
## 6
## Type_2015_Update
## 136
## RuralUrbanContinuumCode2013
## 57
## UrbanInfluenceCode2013
## 57
## Perpov_1980_0711
## 136
## HiCreativeClass2000
## 140
## HiAmenity
## 172
## Retirement_Destination_2015_Update
## 136
county.covid.sub = na.omit(county.covid.sub) # -174set.seed(1)
model.var = model.matrix(log_total_death_per100k~.,data = county.covid.sub[,-c("FIPS","county")])[,-1] # no Alabama here
# head(model.test)
death = county.covid.sub[,log_total_death_per100k]
fit1.cv = cv.glmnet(model.var,death,alpha = 1, nfolds = 10, intercept = T,
penalty.factor = c(rep(0,48),rep(1,ncol(model.var)-48))) # alpha: the para in elastic net
coef.min = coef(fit1.cv,s="lambda.1se") # more parsimonious usually. If use min, maybe there is noise
# coef(fit1.cv,s="lambda.1min")
nonzero.coef = coef.min[which(coef.min!=0),]
# plot(fit1.cv)
nonzero.var = names(nonzero.coef)[-1]Relaxed lasso result
data.selected = data.table(death,model.var[,which(colnames(model.var) %in% nonzero.var)])
fit.1se.lm = lm(death~.,data = data.selected)
# relaxed lasso
stargazer(fit.1se.lm,type=output_format, align=TRUE)| Dependent variable: | |
| death | |
| stateArizona | 0.267 |
| (0.238) | |
| stateArkansas | 0.056 |
| (0.135) | |
| stateCalifornia | -0.771*** |
| (0.159) | |
| stateColorado | -0.357** |
| (0.153) | |
| stateConnecticut | 0.178 |
| (0.303) | |
| stateDelaware | -0.024 |
| (0.469) | |
stateDistrict of Columbia
|
0.520 |
| (0.807) | |
| stateFlorida | 0.031 |
| (0.147) | |
| stateGeorgia | 0.066 |
| (0.117) | |
| stateIdaho | -0.322* |
| (0.166) | |
| stateIllinois | 0.211 |
| (0.132) | |
| stateIndiana | -0.074 |
| (0.134) | |
| stateIowa | 0.287** |
| (0.134) | |
| stateKansas | -0.628*** |
| (0.137) | |
| stateKentucky | -0.756*** |
| (0.128) | |
| stateLouisiana | 0.375*** |
| (0.143) | |
| stateMaine | -1.370*** |
| (0.225) | |
| stateMaryland | -0.059 |
| (0.196) | |
| stateMassachusetts | 0.077 |
| (0.240) | |
| stateMichigan | -0.147 |
| (0.135) | |
| stateMinnesota | -0.157 |
| (0.138) | |
| stateMississippi | 0.264** |
| (0.132) | |
| stateMissouri | -0.378*** |
| (0.129) | |
| stateMontana | 0.547*** |
| (0.158) | |
| stateNebraska | -0.386*** |
| (0.143) | |
| stateNevada | -0.577** |
| (0.227) | |
stateNew Hampshire
|
-0.806*** |
| (0.273) | |
stateNew Jersey
|
0.329 |
| (0.209) | |
stateNew Mexico
|
-0.653*** |
| (0.192) | |
stateNew York
|
-0.367** |
| (0.151) | |
stateNorth Carolina
|
-0.368*** |
| (0.127) | |
stateNorth Dakota
|
0.950*** |
| (0.166) | |
| stateOhio | -0.523*** |
| (0.134) | |
| stateOklahoma | -0.374*** |
| (0.137) | |
| stateOregon | -0.865*** |
| (0.174) | |
| statePennsylvania | 0.039 |
| (0.146) | |
stateRhode Island
|
-0.177 |
| (0.372) | |
stateSouth Carolina
|
-0.011 |
| (0.153) | |
stateSouth Dakota
|
0.809*** |
| (0.151) | |
| stateTennessee | 0.058 |
| (0.130) | |
| stateTexas | 0.168 |
| (0.127) | |
| stateUtah | -1.090*** |
| (0.196) | |
| stateVermont | -2.140*** |
| (0.239) | |
| stateVirginia | -0.476*** |
| (0.123) | |
| stateWashington | -0.857*** |
| (0.168) | |
stateWest Virginia
|
-0.678*** |
| (0.155) | |
| stateWisconsin | -0.134 |
| (0.140) | |
| stateWyoming | 0.208 |
| (0.205) | |
| PovertyAllAgesPct | 0.003 |
| (0.005) | |
| PerCapitaInc | -0.00001 |
| (0.00001) | |
| PctEmpConstruction | -0.049*** |
| (0.007) | |
| PctEmpMining | -0.019*** |
| (0.006) | |
| PctEmpAgriculture | -0.039*** |
| (0.004) | |
| PctEmpManufacturing | 0.003 |
| (0.003) | |
| PopDensity2010 | 0.00003*** |
| (0.00001) | |
| Age65AndOlderPct2010 | 0.034*** |
| (0.008) | |
| Under18Pct2010 | 0.060*** |
| (0.008) | |
| Ed3SomeCollegePct | -0.021*** |
| (0.005) | |
| Ed5CollegePlusPct | -0.011*** |
| (0.004) | |
| NetMigrationRate1019 | -0.012*** |
| (0.003) | |
| NaturalChangeRate1019 | -0.044*** |
| (0.010) | |
| WhiteNonHispanicPct2010 | -0.003* |
| (0.002) | |
| HispanicPct2010 | 0.009*** |
| (0.002) | |
| Type_2015_Update | -0.019** |
| (0.009) | |
| Constant | 4.530*** |
| (0.417) | |
| Observations | 3,105 |
| R2 | 0.359 |
| Adjusted R2 | 0.345 |
| Residual Std. Error | 0.790 (df = 3040) |
| F Statistic | 26.600*** (df = 64; 3040) |
| Note: | p<0.1; p<0.05; p<0.01 |
BIC graphs
fit.final.1 = regsubsets(death~.,data = data.selected,method = "exhaustive",
nvmax = ncol(data.selected)-1,force.in = c(1:48),really.big = T) # compared to Arizona
summary.fit.final.1 = summary(fit.final.1)
plot(summary.fit.final.1$bic)opt.index = 10 # with bicWe choose \(p=10\) by BIC criteria.
Final model after fine tuning
bic.var.select = summary.fit.final.1$which[opt.index,-1]
bic.var = names(bic.var.select)[which(bic.var.select)]
# bic.var
final.expr = as.formula(paste("death", "~", paste(bic.var, collapse = "+")))
fit.final.2 = lm(final.expr,data = data.selected)
stargazer(fit.final.2,type=output_format, align=TRUE)| Dependent variable: | |
| death | |
| stateArizona | 0.208 |
| (0.237) | |
| stateArkansas | 0.021 |
| (0.134) | |
| stateCalifornia | -0.863*** |
| (0.155) | |
| stateColorado | -0.434*** |
| (0.150) | |
| stateConnecticut | 0.070 |
| (0.299) | |
| stateDelaware | -0.099 |
| (0.469) | |
stateDistrict of Columbia
|
0.657 |
| (0.804) | |
| stateFlorida | -0.016 |
| (0.142) | |
| stateGeorgia | 0.053 |
| (0.116) | |
| stateIdaho | -0.399** |
| (0.163) | |
| stateIllinois | 0.107 |
| (0.127) | |
| stateIndiana | -0.183 |
| (0.128) | |
| stateIowa | 0.178 |
| (0.129) | |
| stateKansas | -0.706*** |
| (0.133) | |
| stateKentucky | -0.841*** |
| (0.121) | |
| stateLouisiana | 0.371*** |
| (0.141) | |
| stateMaine | -1.490*** |
| (0.222) | |
| stateMaryland | -0.157 |
| (0.191) | |
| stateMassachusetts | -0.005 |
| (0.238) | |
| stateMichigan | -0.237* |
| (0.132) | |
| stateMinnesota | -0.268** |
| (0.133) | |
| stateMississippi | 0.309** |
| (0.131) | |
| stateMissouri | -0.459*** |
| (0.123) | |
| stateMontana | 0.485*** |
| (0.155) | |
| stateNebraska | -0.482*** |
| (0.138) | |
| stateNevada | -0.647*** |
| (0.226) | |
stateNew Hampshire
|
-0.950*** |
| (0.271) | |
stateNew Jersey
|
0.254 |
| (0.203) | |
stateNew Mexico
|
-0.725*** |
| (0.190) | |
stateNew York
|
-0.429*** |
| (0.142) | |
stateNorth Carolina
|
-0.387*** |
| (0.126) | |
stateNorth Dakota
|
0.848*** |
| (0.161) | |
| stateOhio | -0.632*** |
| (0.129) | |
| stateOklahoma | -0.400*** |
| (0.136) | |
| stateOregon | -0.932*** |
| (0.172) | |
| statePennsylvania | -0.068 |
| (0.140) | |
stateRhode Island
|
-0.287 |
| (0.370) | |
stateSouth Carolina
|
-0.001 |
| (0.152) | |
stateSouth Dakota
|
0.738*** |
| (0.148) | |
| stateTennessee | -0.003 |
| (0.128) | |
| stateTexas | 0.119 |
| (0.126) | |
| stateUtah | -1.200*** |
| (0.188) | |
| stateVermont | -2.280*** |
| (0.236) | |
| stateVirginia | -0.516*** |
| (0.121) | |
| stateWashington | -0.923*** |
| (0.167) | |
stateWest Virginia
|
-0.792*** |
| (0.148) | |
| stateWisconsin | -0.252* |
| (0.136) | |
| stateWyoming | 0.106 |
| (0.203) | |
| PctEmpConstruction | -0.057*** |
| (0.007) | |
| PctEmpMining | -0.024*** |
| (0.005) | |
| PctEmpAgriculture | -0.041*** |
| (0.003) | |
| Age65AndOlderPct2010 | 0.032*** |
| (0.008) | |
| Under18Pct2010 | 0.060*** |
| (0.007) | |
| Ed3SomeCollegePct | -0.024*** |
| (0.005) | |
| Ed5CollegePlusPct | -0.016*** |
| (0.002) | |
| NetMigrationRate1019 | -0.014*** |
| (0.002) | |
| NaturalChangeRate1019 | -0.040*** |
| (0.010) | |
| HispanicPct2010 | 0.011*** |
| (0.002) | |
| Constant | 4.520*** |
| (0.267) | |
| Observations | 3,105 |
| R2 | 0.354 |
| Adjusted R2 | 0.342 |
| Residual Std. Error | 0.792 (df = 3046) |
| F Statistic | 28.800*** (df = 58; 3046) |
| Note: | p<0.1; p<0.05; p<0.01 |
# all significantAge65AnOlderPct2010 has a significantly positive coefficient. However, Under18Pct2010 also has a significantly positive coefficient and is larger than that for elderly. This does not give strong support to the argument that covid affects the elderly the most; we would rather interpret it as, covid affect the middle ages the least.
BlackPct is not in the regression after controlling for existing variables while HispanicPct is in the regression. The coefficient for it is significantly positive, indicating that a higher Hispanic percentage in the region is connected with a higher fatal rate of covid. The analysis gives some support on a higher fatal rate in Hispanic group; it is not very clear for the black group.
scatter.list = list()
for (i in 49:58) { # cannot do this. the plot is built only after it's invoked if in the loop? Use aes_string
name = bic.var[i]
scatter.list[[i-48]] = ggplot(data.selected,aes_string(x=name,y="death"))+
geom_point()+
geom_smooth(method = "lm")+
xlab(name)+
ylab("log.new.death")+
theme_bw()
}Scatter Plots
plot_grid(plotlist = scatter.list)# hist(county.covid$total_death_per100k,breaks = seq(0,900,10))Residual Plot
# residual plot
plot(fit.final.2,1)QQ Plot
# qq plot
plot(fit.final.2,2)Seems not a good fit. Homoscedasticity is not well satisfied from the residual plot; from the residual plot we can see that the residuals have much thicker tails than the normal distribution.
From the scatter plots presented above, we can see that there are a lot of counties with zero in log total covid death per 100k and they are distant to other observations. Therefore, we may consider mixture models (for example, zero-inflated model) and classify them into two groups first, then make inference within every group.
As with the possible important variables, medical conditions could be one of them. Also total number of death per 100k may not be a perfect measure for our goal because it contains a part of randomness, i.e., the spread of covid in an area can have some random determinants; the virus might unexpectedly break out in some areas and result in a higher infection rate and mortality rate. We may want to complement the analysis with another dependent variable like \(\frac{TotalDeaths}{TotalCases}\).
Missing values are clustered in Puerto Rico, Alaska. These states are different in property with continent states so we may not trust the imputation results.